logo of company

Recipes for modifying some analyses


Where we present some pieces of code to modify the bioinformatics pipeline

Author: Adrien Taudière

Date: November 25, 2024

See also Easy16S for inspiration in bash.

Forward only pipeline

  1. Replace all the “paired end” area with the following code
  ##> Remove primers (forward reads only)
  ### Run cutadapt on the raw fastq files; only the forward primer is supplied.
  ### `args_before_cutadapt` is a shell prefix that sources conda and activates
  ### the `cutadaptenv` environment (presumably prepended to the cutadapt call).
  tar_target(
    cutadapt,
    cutadapt_remove_primers(
      path_to_fastq = here("data/data_raw/rawseq/"),
      primer_fw = fw_primer_sequences,
      folder_output = here("data/data_intermediate/seq_wo_primers/"),
      args_before_cutadapt = "source ~/miniforge3/etc/profile.d/conda.sh && conda activate cutadaptenv && "
    )
  ),
  ### List the primer-free fastq files. `cutadapt` is referenced only so that
  ### this target depends on (and therefore runs after) the primer removal.
  tar_target(data_raw, {
    cutadapt
    list_fastq_files(path = here::here("data/data_intermediate/seq_wo_primers/"),
                     paired_end = FALSE)
  }),

  ##> Classical dada2 pipeline (forward reads only)
  tar_target(data_fnfs, data_raw$fnfs),
  ### Pre-filtered data with low stringency
  ### Only the forward reads are given: this recipe defines no `data_fnrs`
  ### target, so no `rev` argument must be passed.
  tar_target(
    filtered,
    filter_trim(
      fw = data_fnfs,
      output_fw = here("data/data_intermediate/filterAndTrim_fwd"),
      multithread = n_threads,
      compress = TRUE
    )
  ),

  ### Dereplicate fastq files
  tar_target(derep_fs, derepFastq(filtered[[1]]), format = "qs"),
  ### Learns the error rates
  tar_target(err_fs, learnErrors(derep_fs, multithread = n_threads), format = "qs"),
  ### Make amplicon sequence variants
  tar_target(ddF, dada(derep_fs, err_fs, multithread = n_threads), format = "qs"),
  ### Build a table of ASV x Samples (from the forward reads only, no merging)
  tar_target(seq_tab, makeSequenceTable(ddF)),

Add a second taxonomic assignment using a different database or algorithm

  1. Add a new database file (fasta) in data/data_raw/refseq
  2. Copy the two targets below and fill in the appropriate names
  3. Rename the targets by using the new name (e.g. data_phyloseq_newDB) instead of data_phyloseq in the subsequent targets
[...]
##> Reference database (fasta) used for the second taxonomic assignment
### Replace XXX with the name of the file added in data/data_raw/refseq.
### `format = "file"` declares the value of this target as a path to a file.
tar_target(file_refseq_taxo2, "data/data_raw/refseq/XXX", format = "file")

[...]

##> Second taxonomic assignment, stored in a new phyloseq target
### Calls add_new_taxonomy_pq() on `data_phyloseq` with the second reference
### file. `suffix` and `taxLevels` presumably have to be adapted to the
### database in use (here PR2 and its nine ranks) -- TODO confirm.
tar_target(
  name = data_phyloseq_newDB,
  command = add_new_taxonomy_pq(
    data_phyloseq,
    file_refseq_taxo2,
    suffix = "PR2",
    taxLevels = c(
      "Kingdom", "Supergroup", "Division", "Subdivision", "Class",
      "Order", "Family", "Genus", "Species"
    )
  )
)

Filter by % of sequence identity (with blast)

##> Filter the taxa of `data_phyloseq` with blast
### The reference fasta is addressed by its path below the project root
### returned by here::here(); blast is asked to use 4 processes.
tar_target(
  name = d_blast,
  command = filter_taxa_blast(
    data_phyloseq,
    fasta_for_db = file.path(here::here(), file_refseq_taxo),
    nproc = 4
  )
)

Add FUNGuild information for Fungi

##> Add FUNGuild information to the phyloseq object
### The function is called with its namespace, so MiscMetabar does not need
### to be attached for this target.
tar_target(
  name = d_funguild,
  command = MiscMetabar::add_funguild_info(data_phyloseq)
)

Add Protax information for Bacteria